Recommendation
In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.offline
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
from IPython.display import IFrame

plotly.offline.init_notebook_mode()
In [2]:
linkdf = pd.read_csv("links.csv")
moviedf = pd.read_csv("movies_metadata.csv")
ratingdf = pd.read_csv("ratings.csv")

Datasets

  • links.csv : the mapping between the TMDB id, the IMDB id, and the movieId used in the ratings dataset
  • movies_metadata.csv : movie metadata (title, release date, genres, ...)
  • ratings.csv : the user ratings

Notice:

  • Be careful with the links dataset: the movieId in the ratings data needs to be converted to an IMDB id (via links.csv) before joining with the metadata; a sketch of this conversion follows the list.
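As a minimal sketch of that conversion (using the linkdf loaded above; the helper name is illustrative):

# map a ratings movieId to the zero-padded IMDB id string used in
# movies_metadata.csv, e.g. 1 -> "tt0114709"
movieid_to_imdb = dict(zip(linkdf.movieId, linkdf.imdbId))

def to_imdb_str(movie_id):
    return "tt{:07d}".format(int(movieid_to_imdb[movie_id]))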
In [3]:
# per-movie rating count and mean
dftmdb = (ratingdf.groupby("movieId").rating
          .agg(nrating="count", avgrating="mean")
          .reset_index()
          .rename(columns={"movieId": "Id"}))
In [4]:
# let us pick the movies with more than 1000 ratings
pickmovieId = dftmdb[dftmdb.nrating > 1000].Id.unique()
sns.displot(dftmdb[dftmdb.nrating > 1000], x="avgrating")
print("number of movies we choose (criterion: more than 1000 ratings):", len(pickmovieId))
In [5]:
moviedf["date"] = pd.to_datetime(moviedf.release_date, format = "%Y-%m-%d",\
                    errors="coerce")
moviedf["year"] = moviedf.date.dt.year
moviedf["month"] = moviedf.date.dt.month
In [6]:
linkdfpick = linkdf[linkdf.movieId.isin(pickmovieId)]
In [7]:
#linkdfpick["title"]=linkdfpick.imdbId.apply(lambda x:moviedf.title[moviedf.imdb_id == "tt{:07d}".format(x)].values[0])
a = list(pickmovieId)
for i,j in linkdfpick.iterrows():
    if (len(moviedf.title[moviedf.imdb_id == "tt{:07d}".format(int(j.imdbId))])) < 1:
        n = a.index(int(j.movieId))
        del a[n]
pickmovieId = np.array(a)
In [8]:
linkdfpick = linkdf[linkdf.movieId.isin(pickmovieId)].copy()
linkdfpick["title"] = linkdfpick.imdbId.apply(lambda x: moviedf.title[moviedf.imdb_id == "tt{:07d}".format(x)].values[0])
linkdfpick
Out[8]:
movieId imdbId tmdbId title
0 1 114709 862.0 Toy Story
1 2 113497 8844.0 Jumanji
2 3 113228 15602.0 Grumpier Old Men
3 4 114885 31357.0 Waiting to Exhale
4 5 113041 11862.0 Father of the Bride Part II
... ... ... ... ...
41850 166528 3748528 330459.0 Rogue One: A Star Wars Story
41897 166635 1355644 274870.0 Passengers
41901 166643 4846340 381284.0 Hidden Figures
42535 168250 5052448 419430.0 Get Out
42536 168252 3315342 263115.0 Logan

3733 rows × 4 columns

In [9]:
ratingdfpick = ratingdf[ratingdf.movieId.isin(pickmovieId)]
In [10]:
user_review_num = ratingdfpick.groupby("userId").movieId.count().to_dict()
# keep only users with more than 10 ratings among the picked movies
pickuserId = [i for i in user_review_num if user_review_num[i] > 10]
ratingdfpick = ratingdfpick[ratingdfpick.userId.isin(pickuserId)].copy()
In [11]:
# remap user ids to a contiguous range 0..Nusers-1
userold2new = {j: i for i, j in enumerate(ratingdfpick.userId.unique())}
usernew2old = {i: j for i, j in enumerate(ratingdfpick.userId.unique())}
ratingdfpick.userId = ratingdfpick.userId.map(userold2new)
ratingdfpick.head()
Out[11]:
userId movieId rating timestamp
0 0 110 1.0 1425941529
1 0 147 4.5 1425942435
2 0 858 5.0 1425941523
3 0 1221 5.0 1425941546
4 0 1246 5.0 1425941556
In [12]:
# remap movie ids to a contiguous range 0..Nmovies-1
movieold2new = {j: i for i, j in enumerate(ratingdfpick.movieId.unique())}
movienew2old = {i: j for i, j in enumerate(ratingdfpick.movieId.unique())}
ratingdfpick.movieId = ratingdfpick.movieId.map(movieold2new)
ratingdfpick.head()
Out[12]:
userId movieId rating timestamp
0 0 0 1.0 1425941529
1 0 1 4.5 1425942435
2 0 2 5.0 1425941523
3 0 3 5.0 1425941546
4 0 4 5.0 1425941556
In [13]:
ratingdfpick["rating_norm"] = (ratingdfpick.rating - ratingdfpick.rating.mean())/    ratingdfpick.rating.var()
#ratingdfpick.to_csv("ratings_reduced.csv")
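Predictions from the model will come out on this normalized scale; mapping them back to the original star scale just inverts the transform (a minimal sketch, mirroring the variance scaling above):

mu = ratingdfpick.rating.mean()
var = ratingdfpick.rating.var()

def denormalize(r_norm):
    # invert rating_norm = (rating - mu) / var
    return r_norm * var + mu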
In [14]:
df_train, df_test = train_test_split(ratingdfpick, stratify = ratingdfpick.userId, test_size = 0.2)
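Stratifying on userId splits each user's ratings roughly 80/20, so every user seen at validation time also has training data; a quick sanity check (sketch):

assert set(df_test.userId) <= set(df_train.userId)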
In [15]:
print(df_train.info())
print("# of users:", len(df_train.userId.unique()))
print("# of movies:", len(df_train.movieId.unique()))
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18438513 entries, 14826831 to 9835121
Data columns (total 5 columns):
 #   Column       Dtype  
---  ------       -----  
 0   userId       int64  
 1   movieId      int64  
 2   rating       float64
 3   timestamp    int64  
 4   rating_norm  float64
dtypes: float64(2), int64(3)
memory usage: 844.0 MB
None
# of users: 223419
# of movies: 3733

Latent factor model

$$ \hat{r}_{mu} = V_u W_m^T $$

We assume that each movie and each user can be represented by a vector such that the predicted rating $\hat{r}_{mu}$, user $u$'s rating of movie $m$, is the inner product of $V_u$ and $W_m$.

The vectors are trained by minimizing the squared error $$ \sum_{mu} (\hat{r}_{mu} - r_{mu})^2. $$
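For concreteness, a toy version of this prediction with made-up sizes (5 users, 4 movies, 3 latent factors):

V = np.random.randn(5, 3)   # user vectors V_u, one row per user
W = np.random.randn(4, 3)   # movie vectors W_m, one row per movie
r_hat = V @ W.T             # (5, 4): r_hat[u, m] is the inner product of V_u and W_m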

Methods

  • ALS : alternating least squares
  • SGD : stochastic gradient descent. We choose this method; a single-rating version of the update is sketched below.
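The functions below vectorize this update over all users of one movie at a time; for a single observed rating, the step reduces to the following sketch (names are illustrative):

def sgd_step_single(V_u, W_m, r, lr=0.002):
    # residual of the current prediction
    e = V_u @ W_m - r
    # gradient of e**2 w.r.t. W_m is 2*e*V_u (the constant 2 is absorbed
    # into the learning rate, as in the functions below)
    V_u_new = V_u - lr * e * W_m
    W_m_new = W_m - lr * e * V_u
    return V_u_new, W_m_new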
In [50]:
def SGD_without_bias(dftrain, dftest, vec_movies = None, vec_users = None, learning_rate = 0.002, Nepoch = 10, Ndf = 3):
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if vec_users is None:
        vec_users = np.zeros((Nusers, Ndf))
    if vec_movies is None:
        vec_movies = np.random.randn(Ndf, Nmovies)
    else:
        vec_movies = vec_movies.T
    train_loss = []
    test_loss = []
    # pre-group the train/test ratings by movie for fast vectorized updates
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            res = ((vec_users[userIds, :] @  vec_movies[:, mg:mg+1]).ravel()\
                - scores.values)
            step = (vec_users[userIds, :] * res.reshape(-1,1)).sum(axis = 0)
            vec_movies[:,mg] -= np.clip(step * learning_rate, -0.5, 0.5)
            vec_users[userIds, :] -= np.clip((vec_movies[:,mg:mg+1] * res).T * learning_rate,-0.7,0.7)
            trainloss += (res**2).sum()
        train_loss.append(trainloss / len(dftrain))
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = ((vec_users[userIds, :] @  vec_movies[:, mg:mg+1]).ravel()\
                - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end = '')
        test_loss.append(loss / len(dftest))
    return vec_movies.T, vec_users, train_loss, test_loss
In [51]:
def SGD_with_onlybias(dftrain, dftest, bias_movies = None, bias_users = None, learning_rate = 0.001, Nepoch = 10, Ndf = 3):
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if bias_users is None:
        bias_users = np.zeros(Nusers)
    if bias_movies is None:
        bias_movies = np.zeros(Nmovies)
    train_loss = []
    test_loss = []
    # pre-group the train/test ratings by movie for fast vectorized updates
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            res = (bias_movies[mg]\
                + bias_users[userIds]\
                - scores.values)
            bias_movies[mg] -= np.clip(learning_rate * res.sum(), -0.5, 0.5)
            bias_users[userIds] -= np.clip(learning_rate * res, -0.7, 0.7)
            trainloss += (res**2).sum()
        train_loss.append(trainloss / len(dftrain))
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = (bias_movies[mg]\
                + bias_users[userIds]\
                - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end = '')
        test_loss.append(loss / len(dftest))
    return bias_movies, bias_users, train_loss, test_loss
In [52]:
def SGD_with_fixedbias(dftrain, dftest, bias_movies, bias_users, vec_movies = None, vec_users = None, learning_rate = 0.002, Nepoch = 10, Ndf = 3):
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if vec_users is None:
        vec_users = np.zeros((Nusers, Ndf))
    if vec_movies is None:
        vec_movies = np.random.randn(Ndf, Nmovies)
    else:
        vec_movies = vec_movies.T
    train_loss = []
    test_loss = []
    # pre-group the train/test ratings by movie for fast vectorized updates
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            res = ((vec_users[userIds, :] @  vec_movies[:, mg:mg+1]).ravel()\
                + bias_movies[mg]\
                + bias_users[userIds]\
                - scores.values)
            step = (vec_users[userIds, :] * res.reshape(-1,1)).sum(axis = 0)
            vec_movies[:,mg] -= np.clip(step * learning_rate, -0.5, 0.5)
            vec_users[userIds, :] -= np.clip((vec_movies[:,mg:mg+1] * res).T * learning_rate,-0.7,0.7)
            trainloss += (res**2).sum()
        train_loss.append(trainloss / len(dftrain))
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = ((vec_users[userIds, :] @  vec_movies[:, mg:mg+1]).ravel()\
                + bias_movies[mg]\
                + bias_users[userIds]\
                - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end = '')
        test_loss.append(loss / len(dftest))
    return vec_movies.T, vec_users, bias_movies, bias_users, train_loss, test_loss
In [53]:
def SGD_with_bias(dftrain, dftest, bias_movies = None, bias_users = None, vec_movies = None, vec_users = None, learning_rate = 0.002, Nepoch = 10, Ndf = 3):
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if bias_users is None:
        bias_users = np.zeros(Nusers)
    if bias_movies is None:
        bias_movies = np.zeros(Nmovies)
    if vec_users is None:
        vec_users = np.zeros((Nusers, Ndf))
    if vec_movies is None:
        vec_movies = np.random.randn(Ndf, Nmovies)
    else:
        vec_movies = vec_movies.T
    train_loss = []
    test_loss = []
    # pre-group the train/test ratings by movie for fast vectorized updates
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            res = ((vec_users[userIds, :] @  vec_movies[:, mg:mg+1]).ravel()\
                + bias_movies[mg]\
                + bias_users[userIds]\
                - scores.values)
            bias_movies[mg] -= np.clip(learning_rate * res.sum(), -0.2, 0.2)
            bias_users[userIds] -= np.clip(learning_rate * res, -0.3, 0.3)
            step = (vec_users[userIds, :] * res.reshape(-1,1)).sum(axis = 0)
            vec_movies[:,mg] -= np.clip(step * learning_rate, -0.3, 0.3)
            vec_users[userIds, :] -= np.clip((vec_movies[:,mg:mg+1] * res).T * learning_rate,-0.4,0.4)
            trainloss += (res**2).sum()
        train_loss.append(trainloss / len(dftrain))
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = ((vec_users[userIds, :] @  vec_movies[:, mg:mg+1]).ravel()\
                + bias_movies[mg]\
                + bias_users[userIds]\
                - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end = '')
        test_loss.append(loss / len(dftest))
    return vec_movies.T, vec_users, bias_movies, bias_users, train_loss, test_loss
In [91]:
def ALS_with_bias(dftrain, dftest, bias_movies, bias_users, vec_movies = None, vec_users = None, learning_rate = 0.002, Nepoch = 10, Ndf = 3):
    Nusers = len(dftrain.userId.unique())
    Nmovies = len(dftrain.movieId.unique())
    if vec_users is None:
        vec_users = np.zeros((Nusers, Ndf))
    if vec_movies is None:
        vec_movies = np.random.randn(Ndf, Nmovies)
    else:
        vec_movies = vec_movies.T
    train_loss = []
    test_loss = []
    # pre-group the train/test ratings by movie for fast vectorized updates
    userIds_list_train = [dftrain[dftrain.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_train = [dftrain[dftrain.movieId == mg].rating_norm for mg in range(Nmovies)]
    userIds_list_test = [dftest[dftest.movieId == mg].userId for mg in range(Nmovies)]
    scores_list_test = [dftest[dftest.movieId == mg].rating_norm for mg in range(Nmovies)]
    for epoch in range(Nepoch):
        grad_vec_movies = np.zeros_like(vec_movies)
        grad_vec_users = np.zeros_like(vec_users)
        grad_bias_movies = np.zeros_like(bias_movies)
        grad_bias_users = np.zeros_like(bias_users)
        moviepermu = np.random.permutation(Nmovies)
        trainloss = 0
        for mg in moviepermu:
            userIds = userIds_list_train[mg]
            scores = scores_list_train[mg]
            res = ((vec_users[userIds, :] @  vec_movies[:, mg:mg+1]).ravel()\
                + bias_movies[mg]\
                + bias_users[userIds]\
                - scores.values)
            step = (vec_users[userIds, :] * res.reshape(-1,1)).sum(axis = 0)
            grad_bias_movies[mg] += learning_rate * res.sum()
            grad_bias_users[userIds] += learning_rate * res
            grad_vec_movies[:,mg] += step * learning_rate
            grad_vec_users[userIds, :] += (vec_movies[:,mg:mg+1] * res).T * learning_rate
            trainloss += (res**2).sum()
        bias_movies -= grad_bias_movies
        bias_users -= grad_bias_users
        vec_movies -= grad_vec_movies
        vec_users -= grad_vec_users
        
        train_loss.append(trainloss / len(dftrain))
        loss = 0
        for mg in moviepermu:
            userIds = userIds_list_test[mg]
            scores = scores_list_test[mg]
            res = ((vec_users[userIds, :] @  vec_movies[:, mg:mg+1]).ravel()\
                + bias_movies[mg]\
                + bias_users[userIds]\
                - scores.values)
            loss += (res**2).sum()
        print("\r train loss:", trainloss / len(dftrain), "test loss:", loss / len(dftest), end = '')
        test_loss.append(loss / len(dftest))
    return vec_movies.T, vec_users, bias_movies, bias_users, train_loss, test_loss
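Note that despite its name, ALS_with_bias accumulates the gradients over all movies and applies one full-batch gradient step per epoch, i.e. batch gradient descent rather than true alternating least squares. For contrast, a minimal sketch of a closed-form ALS user update (movie vectors held fixed, no bias terms, a small ridge term for stability; the name and signature are illustrative):

def als_update_users(vec_movies, dftrain, Nusers, ridge=1e-6):
    # solve each user's least-squares problem exactly, movie vectors held fixed
    Ndf = vec_movies.shape[0]
    vec_users = np.zeros((Nusers, Ndf))
    for u, grp in dftrain.groupby("userId"):
        W = vec_movies[:, grp.movieId.values].T   # (n_ratings_u, Ndf)
        A = W.T @ W + ridge * np.eye(Ndf)
        vec_users[u] = np.linalg.solve(A, W.T @ grp.rating_norm.values)
    return vec_users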
In [55]:
outnew3 = SGD_without_bias(df_train,df_test, Nepoch = 120, learning_rate = 0.001, Ndf=4)
 train loss: 0.5143722773736675 test loss: 0.5711242099451379

Use PCA to rotate the vectors

In [56]:
pca = PCA()
pca.fit(outnew3[0].copy())
print(pca.explained_variance_)
[2.95354196 2.04999166 1.56102894 0.03377403]
In [57]:
movie_vec_pca = outnew3[0] @ pca.components_.T
user_vec_pca = outnew3[1] @ pca.components_.T
# the sign of a PCA axis is arbitrary; flip the first component for readability
movie_vec_pca[:,0] *= -1
user_vec_pca[:,0] *= -1
In [58]:
print("user vectors")
print("variance:", user_vec_pca.var(axis=0), "mean:", user_vec_pca.mean(axis=0))
print("movie vectors")
print("variance:", movie_vec_pca.var(axis=0),"mean:", movie_vec_pca.mean(axis=0))
user vectors
variance: [0.03171025 0.01941858 0.01810709 0.06685869] mean: [ 0.21487323 -0.0196491  -0.04450046  0.10266513]
movie vectors
variance: [2.95275077 2.0494425  1.56061077 0.03376498] mean: [ 0.10634077  0.03189689  0.10294954 -1.67656792]
In [59]:
movie_factor = pd.DataFrame(movie_vec_pca, columns = ["comp_1", "comp_2", "comp_3", "comp_4"])
movie_factor["movieId"] = movie_factor.index.map(lambda x: movienew2old[x])
movie_factor["title"] = movie_factor["movieId"].apply(lambda x: linkdfpick.title[linkdfpick.movieId == x].values[0])
In [95]:
fig = px.scatter_3d(movie_factor, x="comp_1", y="comp_2", z="comp_3", hover_name="title",color = "comp_1")
fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=30),
)
fig.write_html("NoBias3D.html")
IFrame(src='./NoBias3D.html', width=800, height=700)
Out[95]:
In [96]:
fig = px.scatter(movie_factor, x="comp_2", y="comp_3", hover_name="title",color = "comp_1")
fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=30),
)
fig.write_html("NoBias2D.html")
IFrame(src='./NoBias2D.html', width=800, height=700)
Out[96]:

Discussion

We use four components for the latent factor vector. After PCA, the 1st component (largest variance) is almost the average rating of the movie. The 2nd and 3rd components represent properties of the movies. The last component is almost constant across movies but has large variance across the user vectors, which implies that it represents a user's average rating. Therefore, the first and last components effectively act as the movie bias and the user bias, respectively. Below, I also train a model with explicit biases and a two-component vector (3 parameters per movie or user), which gives a more interpretable result, though the validation loss is slightly larger.
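A quick check of the first claim (a sketch; rows of movie_vec_pca are ordered by the remapped movieId, so sorting the groupby index aligns them):

movie_avg = ratingdfpick.groupby("movieId").rating.mean().sort_index()
print(np.corrcoef(movie_vec_pca[:, 0], movie_avg.values)[0, 1])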

The recommendation system with bias

We can improve the model by adding bias terms for movies and users. (Some movies have a common quality everyone agrees on, and some people tend to give relatively low scores to all kinds of movies.) $$ \hat{r}_{mu} = V_u W_m^T + b_u + b_m $$
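In code, the per-rating prediction of this model is simply (a sketch using the (Ndf, Nmovies) orientation the training functions use internally; the helper name is illustrative):

def predict_with_bias(vec_users, vec_movies, bias_users, bias_movies, u, m):
    return vec_users[u] @ vec_movies[:, m] + bias_users[u] + bias_movies[m]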

In [63]:
outwithbias = SGD_with_bias(df_train,df_test, Nepoch = 120, learning_rate = 0.002, Ndf=2)
 train loss: 0.5319081036116992 test loss: 0.5727601040517722
In [64]:
# curves of training and validation loss
sns.set_style("whitegrid")
sns.set_context("talk")
plt.plot(outwithbias[4], label = "train loss with bias")
plt.plot(outwithbias[5], label = "test loss with bias")
plt.plot(outnew3[2], label = "train loss without bias")
plt.plot(outnew3[3], label = "test loss without bias")
plt.legend()
plt.xlabel("epoch")
plt.ylabel("MSE loss")
Out[64]:
Text(0, 0.5, 'MSE loss')
In [65]:
movie_factor_bias = pd.DataFrame(outwithbias[0], columns = ["comp_1", "comp_2"])
movie_factor_bias["bias"] = outwithbias[2]
movie_factor_bias["movieId"] = movie_factor_bias.index.map(lambda x: movienew2old[x])
movie_factor_bias["title"] = movie_factor_bias["movieId"].apply(
    lambda x: linkdfpick.title[linkdfpick.movieId == x].values[0])
In [97]:
fig = px.scatter_3d(movie_factor_bias, x="comp_1", y="comp_2", z="bias", hover_name="title",color = "bias")
fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=30),
)
fig.write_html("Bias3D.html")
IFrame(src='./Bias3D.html', width=800, height=700)
Out[97]:
In [98]:
fig = px.scatter(movie_factor_bias, x="comp_1", y="comp_2", hover_name="title",color = "bias")
fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=30),
)
fig.write_html("Bias2D.html")
IFrame(src='./Bias2D.html', width=800, height=700)
Out[98]:
In [68]:
imdbidstr = linkdfpick.imdbId.apply(lambda x: "tt{:07d}".format(x))
imdbidstr
Out[68]:
0        tt0114709
1        tt0113497
2        tt0113228
3        tt0114885
4        tt0113041
           ...    
41850    tt3748528
41897    tt1355644
41901    tt4846340
42535    tt5052448
42536    tt3315342
Name: imdbId, Length: 3733, dtype: object
In [69]:
moviedfpick = moviedf[moviedf.imdb_id.isin(imdbidstr)].copy()
In [70]:
moviedfpick["movieId"] = moviedfpick.imdb_id.apply(
    lambda x: linkdfpick.movieId[linkdfpick.imdbId == int(x[2:])].values[0])
moviedfpick["latent_vec_0"] = moviedfpick.movieId.apply(
    lambda x: movie_factor_bias.comp_1[movie_factor_bias.movieId == x].values[0])
moviedfpick["latent_vec_1"] = moviedfpick.movieId.apply(
    lambda x: movie_factor_bias.comp_2[movie_factor_bias.movieId == x].values[0])
moviedfpick["bias"] = moviedfpick.movieId.apply(
    lambda x: movie_factor_bias.bias[movie_factor_bias.movieId == x].values[0])
In [71]:
moviedfpick["latent_vec_nobias_0"] = moviedfpick.movieId.apply(
    lambda x: movie_factor.comp_1[movie_factor.movieId == x].values[0])
moviedfpick["latent_vec_nobias_1"] = moviedfpick.movieId.apply(
    lambda x: movie_factor.comp_2[movie_factor.movieId == x].values[0])
moviedfpick["latent_vec_nobias_2"] = moviedfpick.movieId.apply(
    lambda x: movie_factor.comp_3[movie_factor.movieId == x].values[0])
moviedfpick["latent_vec_nobias_3"] = moviedfpick.movieId.apply(
    lambda x: movie_factor.comp_4[movie_factor.movieId == x].values[0])
In [72]:
moviedfpick.to_csv("movie_recommendation_pick.csv", index = False)
In [99]:
fig = px.scatter(moviedfpick, x="latent_vec_0", y="latent_vec_1", hover_name="title",color = "bias")
fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=30),
)
fig.write_html("Bias2D_2.html")
IFrame(src='./Bias2D_2.html', width=800, height=700)
Out[99]:
In [100]:
fig = px.scatter(moviedfpick, x="latent_vec_nobias_1", y="latent_vec_nobias_2", hover_name="title",color = "latent_vec_nobias_0")
fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=30),
)
fig.write_html("NoBias2D_2.html")
IFrame(src='./NoBias2D_2.html', width=800, height=700)
Out[100]:
In [75]:
user_factor_nobias = pd.DataFrame(user_vec_pca, columns = ["comp_1", "comp_2", "comp_3", "comp_4"])
user_factor_nobias["userId"] = user_factor_nobias.index.map(lambda x: usernew2old[x])
user_factor_nobias.head()
Out[75]:
comp_1 comp_2 comp_3 comp_4 userId
0 0.380188 0.189237 0.004801 -0.191643 1
1 0.197828 -0.237817 0.176798 0.300713 2
2 0.317930 -0.098942 0.111594 0.012443 4
3 0.477123 -0.043915 0.077093 0.122621 5
4 0.414061 0.075704 -0.097896 0.442637 7
In [76]:
user_factor_bias = pd.DataFrame(outwithbias[1], columns = ["comp_1", "comp_2"])
user_factor_bias["bias"] = outwithbias[3]
user_factor_bias["userId"] = user_factor_bias.index.map(lambda x: usernew2old[x])
user_factor_bias.head()
Out[76]:
comp_1 comp_2 bias userId
0 -0.128052 -0.101599 0.443831 1
1 0.036008 0.120057 -0.374288 2
2 0.176226 -0.002649 0.003053 4
3 0.246821 -0.099580 0.042324 5
4 0.190903 -0.231788 -0.566056 7
In [77]:
user_factor_nobias.to_csv("user_vec_nobias.csv", index = False)
user_factor_bias.to_csv("user_vec_bias.csv", index = False)

Play with the factors

We can make some interesting plots in the vector space, coloring the latent positions by metadata such as release year and revenue; a nearest-neighbour lookup in the same space is sketched first.
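For example, a minimal sketch of a nearest-neighbour lookup in the biased model's 2-D latent space (Euclidean distance; the helper name is illustrative):

def nearest_movies(title, k=5):
    vecs = movie_factor_bias[["comp_1", "comp_2"]].to_numpy()
    i = movie_factor_bias.index[movie_factor_bias.title == title][0]
    dist = np.linalg.norm(vecs - vecs[i], axis=1)
    return movie_factor_bias.title.iloc[np.argsort(dist)[1:k + 1]]

nearest_movies("Toy Story")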

In [78]:
moviedfpick.head()
Out[78]:
adult belongs_to_collection budget genres homepage id imdb_id original_language original_title overview ... year month movieId latent_vec_0 latent_vec_1 bias latent_vec_nobias_0 latent_vec_nobias_1 latent_vec_nobias_2 latent_vec_nobias_3
0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... http://toystory.disney.com/toy-story 862 tt0114709 en Toy Story Led by Woody, Andy's toys live happily in his ... ... 1995.0 10.0 1 3.212202 0.358735 0.400000 1.570108 -2.012539 -0.455311 -2.128828
1 False NaN 65000000 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... NaN 8844 tt0113497 en Jumanji When siblings Judy and Peter discover an encha... ... 1995.0 12.0 2 0.529560 1.442203 -0.400000 -0.325062 -1.557281 0.428925 -1.733961
2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... NaN 15602 tt0113228 en Grumpier Old Men A family wedding reignites the ancient feud be... ... 1995.0 12.0 3 -0.178935 1.301062 -0.423641 -1.202997 -0.980757 0.091309 -1.853464
3 False NaN 16000000 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... NaN 31357 tt0114885 en Waiting to Exhale Cheated on, mistreated and stepped on, the wom... ... 1995.0 12.0 4 0.365878 1.180831 -0.540973 -1.633516 0.068297 2.646111 -2.011929
4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 [{'id': 35, 'name': 'Comedy'}] NaN 11862 tt0113041 en Father of the Bride Part II Just when George Banks has recovered from his ... ... 1995.0 2.0 5 0.494886 1.956363 -0.352500 -1.584743 -1.505743 1.481863 -1.707286

5 rows × 35 columns

In [89]:
fig = px.scatter(moviedfpick, x="latent_vec_0", y="latent_vec_1", hover_name="title",color = "year")
fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=30),
)
fig.write_html("Biasvecvsyear.html")
IFrame(src='./Biasvecvsyear.html', width=800, height=700)
In [101]:
fig = px.scatter(moviedfpick, x="latent_vec_0", y="latent_vec_1", hover_name="title",color = "revenue")
fig.update_layout(
    margin=dict(l=30, r=30, t=30, b=30),
)
fig.write_html("Biasvecvsrevenue.html")
IFrame(src='./Biasvecvsrevenue.html', width=800, height=700)
Out[101]:
In [ ]: